import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#For splitting data
from sklearn.model_selection import train_test_split
#For scaling the data to remove the importance of units
from sklearn.preprocessing import StandardScaler
# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score, confusion_matrix
#Importing the vehicle silhouette file into a dataframe
ds_vehicle= pd.read_csv("vehicle-1.csv")
#General information about the data.
# NOTE: bare expressions such as .head()/.shape only display output in a
# notebook; when run as a plain script they are no-ops.
ds_vehicle.head(10)
ds_vehicle.info()
ds_vehicle.shape
#Checking for null values
print("Any null values in the dataset :",ds_vehicle.isnull().values.any())
ds_vehicle.isnull().sum()
#List all the rows having a missing value in any single or multiple columns.
#Columns having missing values (used throughout the missing-value treatment below)
missing_values_cols=ds_vehicle.columns[ds_vehicle.isnull().any()]
ds_vehicle[ds_vehicle.isnull().any(axis=1)][missing_values_cols].head()
#Label encode the target class. LabelEncoder assigns codes in sorted label
# order — presumably bus -> 0, car -> 1, van -> 2, which matches how the
# class codes are used in the treatment sections below (TODO: confirm
# against the raw labels).
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
ds_vehicle["class"] = labelencoder.fit_transform(ds_vehicle['class'])
ds_vehicle["class"].value_counts()
# --- Missing-value treatment: 'circularity' ---
ds_vehicle[ds_vehicle['circularity'].isnull()][missing_values_cols]
# Dropping rows 105, 118 and 266 since they have multiple NaN values
ds_vehicle.drop([105, 118, 266], inplace=True)
# Check the class of the two remaining rows with a missing 'circularity'
ds_vehicle.loc[5].loc['class'], ds_vehicle.loc[396].loc['class']
# Both remaining rows are of the category "bus" (encoded class 0), so we
# replace the NaNs with the class-0 median.
# FIX: assign the filled column back instead of calling
# fillna(..., inplace=True) on a column selection — the chained-inplace form
# triggers SettingWithCopy behaviour and is deprecated in recent pandas.
ds_vehicle['circularity'] = ds_vehicle['circularity'].fillna(
    ds_vehicle['circularity'][ds_vehicle['class'] == 0].median())
# Confirm no 'circularity' NaNs remain
ds_vehicle[ds_vehicle['circularity'].isnull()][missing_values_cols]
# --- Missing-value treatment: 'distance_circularity' ---
# We repeat the same process for the rest of the missing values in different attributes
ds_vehicle[ds_vehicle['distance_circularity'].isnull()][missing_values_cols]
# Row 207 has more than one missing value, so we drop it and replace the rest
# of the missing values in 'distance_circularity' by the class median.
ds_vehicle.drop(207, inplace=True)
# Rows 35 and 319 are of different categories, so each is filled with its own
# class median. NOTE: row-wise .replace(np.nan, ...) fills every NaN in the
# row — safe here only because 'distance_circularity' is the sole missing
# column in these rows.
ds_vehicle.loc[35] = ds_vehicle.loc[35].replace(np.nan, ds_vehicle['distance_circularity'][ds_vehicle['class'] == 2].median())
ds_vehicle.loc[319] = ds_vehicle.loc[319].replace(np.nan, ds_vehicle['distance_circularity'][ds_vehicle['class'] == 0].median())
#Treating missing values for radius_ratio
ds_vehicle[ds_vehicle['radius_ratio'].isnull()][missing_values_cols]
#Since there are no other missing values in other attributes, we do not drop any row.
#Replacing the missing values with the median of the respective vehicle category.
ds_vehicle.loc[[9,78,159,287,345,467]]['class']
#From the data we observe that :
#Rows 9, 159, 467 are of category 1
#Rows 78, 345 are of category 0
#Row 287 is of category 2  (fixed: the comment previously said row 387)
#Replacing missing values according to the category.
# NOTE: row-wise .replace(np.nan, ...) fills every NaN in the row with the
# radius_ratio median — safe here only because radius_ratio is the sole
# missing column in these rows (verified above).
ds_vehicle.loc[[9,159,467]]=ds_vehicle.loc[[9,159,467]].replace(np.nan,ds_vehicle['radius_ratio'][ds_vehicle['class']==1].median())
ds_vehicle.loc[[78,345 ]]=ds_vehicle.loc[[ 78,345 ]].replace(np.nan,ds_vehicle['radius_ratio'][ds_vehicle['class']==0].median())
ds_vehicle.loc[287]=ds_vehicle.loc[287].replace(np.nan,ds_vehicle['radius_ratio'][ds_vehicle['class']==2].median())
#Treating missing values for pr.axis_aspect_ratio
ds_vehicle[ds_vehicle['pr.axis_aspect_ratio'].isnull()][missing_values_cols]
#Dropping row 222 since it has more than 1 NaN value, and replacing the remaining with the median of the same class
ds_vehicle.drop(222, inplace=True)
#Row 19 is of class 1, so fill its NaN with the class-1 median of "pr.axis_aspect_ratio"
ds_vehicle.loc[19]=ds_vehicle.loc[19].replace(np.nan,ds_vehicle['pr.axis_aspect_ratio'][ds_vehicle['class']==1].median())
#Treating scatter_ratio
ds_vehicle[ds_vehicle['scatter_ratio'].isnull()][missing_values_cols]
#Since this row has multiple attributes with NaN values, we drop it.
ds_vehicle.drop(249,inplace=True)
#Treating missing value for elongatedness
ds_vehicle[ds_vehicle['elongatedness'].isnull()][missing_values_cols]
ds_vehicle.loc[215]['class']
#Row 215 is of class 1, so fill its NaN with the class-1 median of elongatedness
ds_vehicle.loc[215]=ds_vehicle.loc[215].replace(np.nan,ds_vehicle['elongatedness'][ds_vehicle['class']==1].median())
#Treating missing values for pr.axis_rectangularity
ds_vehicle[ds_vehicle['pr.axis_rectangularity'].isnull()][missing_values_cols]
#Since no row has multiple missing attributes, we replace each missing value
#with the median of its own class.
#Class of the rows with missing values:
ds_vehicle.loc[[70,237,273]]['class']
ds_vehicle.loc[70]=ds_vehicle.loc[70].replace(np.nan,ds_vehicle['pr.axis_rectangularity'][ds_vehicle['class']==1].median())
ds_vehicle.loc[237]=ds_vehicle.loc[237].replace(np.nan,ds_vehicle['pr.axis_rectangularity'][ds_vehicle['class']==0].median())
ds_vehicle.loc[273]=ds_vehicle.loc[273].replace(np.nan,ds_vehicle['pr.axis_rectangularity'][ds_vehicle['class']==2].median())
#Treating scaled_variance
ds_vehicle[ds_vehicle['scaled_variance'].isnull()][missing_values_cols]
#Since none of the missing rows have any other missing values, we replace the
#NaNs with the class-specific median (the code below uses median, not mean).
ds_vehicle.loc[[372,522]]['class']
ds_vehicle.loc[372]=ds_vehicle.loc[372].replace(np.nan,ds_vehicle['scaled_variance'][ds_vehicle['class']==2].median())
ds_vehicle.loc[522]=ds_vehicle.loc[522].replace(np.nan,ds_vehicle['scaled_variance'][ds_vehicle['class']==1].median())
#Treating scaled_variance.1
ds_vehicle[ds_vehicle['scaled_variance.1'].isnull()][missing_values_cols]
#Both rows have the same class, so we replace with the class-1 median
ds_vehicle.loc[[308,496]]=ds_vehicle.loc[[ 308,496]].replace(np.nan,ds_vehicle['scaled_variance.1'][ds_vehicle['class']==1].median())
#Treating missing values in scaled_radius_of_gyration.1
ds_vehicle[ds_vehicle['scaled_radius_of_gyration.1'].isnull()][missing_values_cols]
#Since row 66 has multiple missing attributes, we drop it.
#For the rest of the rows we proceed with median replacement based on class.
ds_vehicle.drop(66, inplace=True)
ds_vehicle.loc[[77,192,329]]['class']
#All the remaining rows belong to class 1, so fill with the class-1 median
ds_vehicle.loc[[77,192,329]]=ds_vehicle.loc[[ 77,192,329]].replace(np.nan,ds_vehicle['scaled_radius_of_gyration.1'][ds_vehicle['class']==1].median())
#Treating missing values for the skewness_about attribute
ds_vehicle[ds_vehicle['skewness_about'].isnull()][missing_values_cols]
ds_vehicle.loc[[141,177,285]]['class']
#Rows 141 and 177 are class 0; row 285 is class 1 — fill each with its class median
ds_vehicle.loc[[141,177]]=ds_vehicle.loc[[141,177]].replace(np.nan,ds_vehicle['skewness_about'][ds_vehicle['class']==0].median())
ds_vehicle.loc[[285]]=ds_vehicle.loc[[285]].replace(np.nan,ds_vehicle['skewness_about'][ds_vehicle['class']==1].median())
#Treating missing values in skewness_about.1
ds_vehicle[ds_vehicle['skewness_about.1'].isnull()][missing_values_cols]
#No missing values left here — the corresponding row was dropped in an earlier treatment.
#Treating missing values in skewness_about.2
ds_vehicle[ds_vehicle['skewness_about.2'].isnull()][missing_values_cols]
#Row 419 is of class 1 — replace with the class-1 median
ds_vehicle.loc[[419]]=ds_vehicle.loc[[419]].replace(np.nan,ds_vehicle['skewness_about.2'][ds_vehicle['class']==1].median())
#Checking for missing values after all treatments
ds_vehicle.isnull().sum()
ds_vehicle[ds_vehicle.isnull().any(axis=1)][missing_values_cols].shape
#Confirming that no NaN values remain and checking the class balance
ds_vehicle["class"].value_counts()
# --- Exploratory data analysis ---
# Descriptive statistics (count, mean, std, min/max, quartiles) per feature
ds_vehicle.describe()
# Pairwise scatter/KDE plots coloured by vehicle class
sns.pairplot(ds_vehicle,diag_kind="kde",hue="class")
ds_vehicle.corr()
# Correlation matrix, shown as a lower-triangle heatmap.
# (A commented-out draft of an earlier full heatmap was removed here.)
corr = ds_vehicle.corr() # correlation matrix
lower_triangle = np.tril(corr, k = -1) # select only the lower triangle of the correlation matrix
mask = lower_triangle == 0 # to mask the upper triangle in the following heatmap
plt.figure(figsize = (15,8)) # setting the figure size
sns.set_style(style = 'white') # Setting it to white so that we do not see the grid lines
sns.heatmap(lower_triangle, center=0.5, cmap= 'Blues', annot= True, xticklabels = corr.index, yticklabels = corr.columns,
            cbar= False, linewidths= 1, mask = mask) # lower-triangle heatmap
plt.xticks(rotation = 50) # Aesthetic purposes
plt.yticks(rotation = 20) # Aesthetic purposes
plt.show()
# Observations from the pairplot / distributions:
# - Spread of compactness is least for van; mean compactness is highest for car.
#   For bus, compactness is right-skewed, indicating that fewer buses have high compactness.
# - Mean circularity is higher for cars.
# - Mean distance_circularity is also higher for cars.
# - Mean radius_ratio is highest for cars, followed by bus; it is least for vans.
#   pr.axis_aspect_ratio has almost the same distribution for car, van and bus.
#   max.length_aspect_ratio is almost the same for cars and vans, lower for buses.
# - Mean scatter_ratio is highest for cars, followed by bus and van.
# - Mean elongatedness is highest for vans, followed by bus and car.
#   pr.axis_rectangularity is highest for cars, followed by bus and then vans.
#   The distribution of max.length_rectangularity is almost the same for cars, buses and vans.
# - Mean scaled_variance is highest for cars, followed by bus, then vans.
# - Mean scaled_variance.1 is highest for cars, followed by bus, then vans.
#   'scaled_radius_of_gyration', 'scaled_radius_of_gyration.1', 'skewness_about',
#   'skewness_about.1' and 'skewness_about.2' have almost similar distributions
#   for cars, buses and vans. 'hollows_ratio' is lower for buses than for cars and vans.
# - Many columns have long tails, indicating outliers. pr.axis_aspect_ratio and
#   radius_ratio vary strongly for vans; for cars and buses they vary within a small
#   range. scatter_ratio and scaled_variance.1 have an almost perfect positive
#   linear relationship.
# - Many features show high correlation, indicating that we need to drop or reduce
#   multiple features: USING PCA WILL HELP IN REDUCING THE VARIABLES.
# 70/30 train/test split; random_state fixed for reproducibility
df_train, df_test= train_test_split(ds_vehicle, test_size = 0.3, random_state = 1)
df_train.shape, df_test.shape
#We will handle outliers in the training data only and leave the test data untouched.
#Treatment strategy:
# 1. Outliers if close to max value will be replaced with max value of the corresponding class
# 2. If outliers are much above 75% quantile range: mean+2SD, we drop them
# 3. Outlier if close to min value will be replaced by min value of the corresponding class
# 4. If outliers are much lower than 25% quantile range :mean-2SD, we drop them
def outlierCheck(inputSeries):
    """Report IQR-based (Tukey-fence) outliers in a numeric Series.

    Values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR are flagged. A summary
    is printed, and the flagged values are also returned so callers can act
    on them programmatically.

    Parameters
    ----------
    inputSeries : pd.Series
        Numeric column to inspect.

    Returns
    -------
    tuple
        (outliers_low, outliers_high) as pandas Series. Existing callers
        that ignored the previous None return are unaffected.
    """
    q1 = inputSeries.quantile(0.25)
    q3 = inputSeries.quantile(0.75)
    iqr = q3 - q1  # interquartile range
    low_range = q1 - 1.5 * iqr
    high_range = q3 + 1.5 * iqr
    outliers_low = inputSeries[inputSeries < low_range]
    outliers_high = inputSeries[inputSeries > high_range]
    print("25th Quantile value: ", q1)
    print('Outlier low Count =', outliers_low.count())
    print('List of Low outliers: \n')
    print(outliers_low)
    print("75th Quantile value: ", q3)
    print('Outlier High Count = ', outliers_high.count())
    print('List of High outliers: \n')
    print(outliers_high)
    return outliers_low, outliers_high
#Checking outliers in the compactness attribute
sns.boxplot(df_train['compactness'])
#One value around 120 sits very close to the whisker edge, so we keep it as is.
#Checking outliers in circularity
sns.boxplot(df_train["circularity"])
#No outliers in this attribute
sns.boxplot(df_train["distance_circularity"])
#The plot is skewed towards the right, so the data is more distributed to the right of the mean
sns.boxplot(df_train["radius_ratio"])
#There are outliers, so we analyze them per the strategy defined above and decide
outlierCheck(df_train["radius_ratio"])
#Checking the full rows for the flagged outliers
df_train.loc[[37,388]]
#Both rows are from the same category, so we cap both values at the maximum
#acceptable value for the VAN class.
#The maximum value accepted without being an outlier is 250 (from the plot), therefore:
df_train.loc[[37,388],"radius_ratio"]=250.0
sns.boxplot(df_train["pr.axis_aspect_ratio"])
#Inspect the outliers in depth
outlierCheck(df_train["pr.axis_aspect_ratio"])
df_train.loc[[4,37,100,291,388]]
#The flagged rows are distributed among 2 classes, so we check each class separately
print("Values for class 0 \n")
df_train[df_train["class"]==0]["pr.axis_aspect_ratio"].sort_values( ascending=False).head(5)
#The maximum in-whisker value is 76; rows 100 and 4 are far above it, so we drop them
df_train.drop([4,100], inplace=True)
print("Values for class 2 \n")
df_train[df_train["class"]==2]["pr.axis_aspect_ratio"].sort_values( ascending=False).head(10)
#There is a big jump from 70 to 102, so we drop these rows as well
df_train.drop([388,37,291], inplace=True)
sns.boxplot(df_train["max.length_aspect_ratio"])
outlierCheck(df_train["max.length_aspect_ratio"])
#Checking the rows for these outliers :
df_train.loc[[391,127]]
#Checking values of class 2 for the max.length_aspect_ratio attribute
df_train[df_train['class']==2]['max.length_aspect_ratio'].sort_values( ascending=False).head(10)
#There is a major jump from 12 to 25, so we drop that row
df_train.drop(391,inplace=True)
print("Now values for class 0")
df_train[df_train['class']==0]['max.length_aspect_ratio'].sort_values( ascending=False).head(10)
#There is another jump from 8 to 22, so we drop that row too
df_train.drop(127,inplace=True)
sns.boxplot(df_train["scatter_ratio"])
#No visible outlier
sns.boxplot(df_train["elongatedness"])
#No outlier present
sns.boxplot(df_train["pr.axis_rectangularity"])
#No outlier present
sns.boxplot(df_train["max.length_rectangularity"])
#No outlier present
sns.boxplot(df_train["scaled_variance"])
#No outlier present
sns.boxplot(df_train["scaled_variance.1"])
#There is 1 outlier
#Checking the outlier in scaled_variance.1
outlierCheck(df_train["scaled_variance.1"])
df_train.loc[[835]]
df_train[df_train['class']==0]['scaled_variance.1'].sort_values( ascending=False).head(8)
#Row 835 is far above the rest of its class, so we drop it
df_train.drop(835,inplace=True)
sns.boxplot(df_train["scaled_radius_of_gyration.1"])
outlierCheck(df_train["scaled_radius_of_gyration.1"])
#Most of the outliers are below 90 and quite near the whisker at 87,
#hence we leave them as is.
# --- Outlier treatment: 'skewness_about' ---
sns.boxplot(df_train["skewness_about"])
outlierCheck(df_train['skewness_about'])
df_train.loc[[516,505,44,797,623,400]]
# All of these rows belong to class 1; inspect the upper tail of that class
df_train[df_train['class']==1]['skewness_about'].sort_values( ascending=False).head(20)
# Cap the outliers at the nearest whisker value: 18.
# BUG FIX: the original used chained indexing
# (df_train.loc[i]["skewness_about"] = 18), which assigns to a temporary copy
# and silently leaves df_train unchanged. A single .loc with the row list and
# the column label writes through to the frame.
outlier_rows = [516, 505, 44, 797, 623, 400]
df_train.loc[outlier_rows, "skewness_about"] = 18
# --- 'skewness_about.1' ---
sns.boxplot(df_train["skewness_about.1"])
# The lone outlier is near the whisker, so we leave it as is.
# --- 'skewness_about.2' ---
sns.boxplot(df_train["skewness_about.2"])
outlierCheck(df_train["skewness_about.2"])
df_train.drop(516,inplace=True)
sns.boxplot(df_train['hollows_ratio'])
# No outlier in hollows_ratio
# Final shape of the training data after all outlier treatment
df_train.shape
# --- Baseline SVM on the selected raw features ---
# Drop the target plus features flagged as highly correlated / redundant
dropping_labels=["class","elongatedness","scaled_radius_of_gyration.1","hollows_ratio","skewness_about.1","pr.axis_aspect_ratio"]
X_train=df_train.drop(dropping_labels, axis=1)
y_train=df_train['class']
X_test=df_test.drop(dropping_labels, axis=1)
y_test=df_test['class']
X_train.shape,y_train.shape, X_test.shape, y_test.shape
from sklearn.svm import SVC
# Building a Support Vector Machine on the train data.
# NOTE: gamma only affects the rbf/poly/sigmoid kernels and is ignored by the
# linear kernel, so it is omitted here (the original passed gamma=1, which had
# no effect).
svc_model = SVC(C= .1, kernel='linear')
svc_model.fit(X_train, y_train)
prediction = svc_model.predict(X_test)
# Train accuracy, test accuracy, and the test-set confusion matrix
print(svc_model.score(X_train, y_train))
print(svc_model.score(X_test, y_test))
print("Confusion Matrix:\n",confusion_matrix(prediction,y_test))
#Rebuild the same feature/target split for the PCA pipeline
dropping_labels=["class","elongatedness","scaled_radius_of_gyration.1","hollows_ratio","skewness_about.1","pr.axis_aspect_ratio"]
X_train_PCA=df_train.drop(dropping_labels, axis=1)
y_train_PCA=df_train['class']
X_test_PCA=df_test.drop(dropping_labels, axis=1)
y_test_PCA=df_test['class']
X_train_PCA.shape,y_train_PCA.shape, X_test_PCA.shape, y_test_PCA.shape
#Standardise the features so every column has zero mean and unit variance
sc = StandardScaler()
sc.fit(X_train_PCA) # Fit scaler on the train set only
#Transform X_train
X_train_std=sc.transform(X_train_PCA)
#Transform X_test with the same fit as train to prevent data leakage
X_test_std=sc.transform(X_test_PCA)
# Covariance matrix of the standardised training features (X_train_std.T puts
# features in rows, so np.cov treats each feature as a variable)
cov_matrix = np.cov(X_train_std.T)
# BUG FIX: print('... %s', x) printed the literal format string and the object
# as a tuple; use %-formatting so the matrices are rendered properly.
print('Covariance Matrix \n%s' % cov_matrix)
# Eigen decomposition of the covariance matrix (PCA by hand)
eig_vals, eig_vecs = np.linalg.eig(cov_matrix)
print('Eigen Vectors \n%s' % eig_vecs)
print('\n Eigen Values \n%s' % eig_vals)
print("Eigen Values:")
pd.DataFrame(eig_vals).transpose()
# Percentage of variance explained by each component, descending
tot = sum(eig_vals)
var_exp = [( i /tot ) * 100 for i in sorted(eig_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp) # array of size = number of PC dimensions
print("Cumulative Variance Explained", cum_var_exp)
# Plotting the scree chart: individual and cumulative explained variance
plt.figure(figsize=(15 , 6))
plt.bar(range(1, eig_vals.size + 1), var_exp, alpha = 0.5, align = 'center', label = 'Individual explained variance')
plt.step(range(1, eig_vals.size + 1), cum_var_exp, where='mid', label = 'Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc = 'best')
plt.tight_layout()
plt.show()
# Make (eigenvalue, eigenvector) pairs.
# BUG FIX: np.linalg.eig returns eigenvectors as the COLUMNS of eig_vecs, so
# the vector belonging to eig_vals[i] is eig_vecs[:, i]. The original paired
# eigenvalues with ROWS, which scrambles the principal components.
eig_pairs = [(eig_vals[index], eig_vecs[:, index]) for index in range(len(eig_vals))]
# Sort the pairs from highest to lowest eigenvalue. Sorting on the first
# element only (key=...) avoids comparing the eigenvector arrays when two
# eigenvalues tie, which would raise an ambiguous-truth-value error.
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
# Extract the descending-ordered eigenvalues and eigenvectors
eigvalues_sorted = [pair[0] for pair in eig_pairs]
eigvectors_sorted = [pair[1] for pair in eig_pairs]
# Dimensionality reduction: keep the first 8 eigenvectors as the projection basis
P_reduce = np.array(eigvectors_sorted[0:8]).transpose()
Proj_train_data = np.dot(X_train_std,P_reduce) # project training data onto the 8 eigenvectors
Proj_test_data = np.dot(X_test_std,P_reduce) # project test data onto the 8 eigenvectors
#Check shapes of the new train/test feature and target sets after PCA
Proj_train_data.shape,y_train.shape,Proj_test_data.shape,y_test.shape
# --- SVM on the PCA-projected features ---
from sklearn.svm import SVC
# Linear SVM on the reduced feature space.
# NOTE: gamma is ignored by the linear kernel, so it is omitted (the original
# passed gamma=1, which had no effect).
svc_model = SVC(C= .1, kernel='linear')
# Consistency fix: use the *_PCA targets throughout this section. y_train_PCA
# and y_test_PCA hold the same values as y_train/y_test, but the original
# mixed the two names between the linear and rbf runs.
svc_model.fit(Proj_train_data, y_train_PCA)
prediction = svc_model.predict(Proj_test_data)
# Train accuracy, test accuracy, and the test-set confusion matrix
print(svc_model.score(Proj_train_data, y_train_PCA))
print(svc_model.score(Proj_test_data, y_test_PCA))
print("Confusion Matrix:\n",confusion_matrix(prediction,y_test_PCA))
# RBF-kernel SVM on the same projected data for comparison
svc_model = SVC(kernel='rbf')
svc_model.fit(Proj_train_data, y_train_PCA)
prediction = svc_model.predict(Proj_test_data)
print(svc_model.score(Proj_train_data, y_train_PCA))
print(svc_model.score(Proj_test_data, y_test_PCA))
print("Confusion Matrix:\n",confusion_matrix(prediction,y_test_PCA))